import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
import statsmodels.api as sm
import statsmodels.formula.api as smf
my_libs_dir = '../'
if not my_libs_dir in sys.path:
sys.path.append(my_libs_dir) # add the path to my_lib directory
# The following lines are needed to auto-reload my library file
# Without these lines, my library file is read only once and
# modifications of my library file are not reflected.
%load_ext autoreload
%autoreload 1
%aimport my_libs.linear_reg
# import from my library file
from my_libs.linear_reg import step_aic_forward, calc_vifs
%config InlineBackend.figure_formats = {'png', 'retina'} #high-reso images
plt.rcParams['font.family'] = 'Yu Mincho' # for Japanese in graph (Win10)
# To show all rows and columns in the results
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
目的変数に影響を与えていそうな要因は、可能な限り網羅的に説明変数に取り入れる。
encoding='shift-jis' may be needed.
CSVファイルをチェックしてから読み込む。必要に応じて列ラベルを変更。
CSVファイルの漢字コードがShift-JISの場合は encoding='shift-jis' が必要。
csv_in = 'winequality-red_modified-utf8.txt'
df_all = pd.read_csv(csv_in, delimiter='\s+', skiprows=13, header=0)
print(df_all.shape)
print(df_all.info())
display(df_all.head())
(1599, 12) <class 'pandas.core.frame.DataFrame'> RangeIndex: 1599 entries, 0 to 1598 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fixed_acidity 1599 non-null float64 1 volatile_acidity 1599 non-null float64 2 citric_acid 1599 non-null float64 3 residual_sugar 1599 non-null float64 4 chlorides 1598 non-null float64 5 free_sulfur_dioxide 1599 non-null float64 6 total_sulfur_dioxide 1599 non-null float64 7 density 1597 non-null float64 8 pH 1599 non-null float64 9 sulphates 1599 non-null float64 10 alcohol 1599 non-null float64 11 quality 1599 non-null int64 dtypes: float64(11), int64(1) memory usage: 150.0 KB None
| fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
| 1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | NaN | 3.20 | 0.68 | 9.8 | 5 |
| 2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 | 5 |
| 3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | NaN | 3.16 | 0.58 | 9.8 | 6 |
| 4 | 7.4 | 0.70 | 0.00 | 1.9 | NaN | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
数値列・カテゴリー列の様子をみる
display(df_all.describe())
#display(df_all.describe(exclude='number')) # no category value columns
| fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1598.000000 | 1599.000000 | 1599.000000 | 1597.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 |
| mean | 8.319637 | 0.527821 | 0.270976 | 2.538806 | 0.087474 | 15.874922 | 46.467792 | 0.996746 | 3.311113 | 0.658149 | 10.422983 | 5.636023 |
| std | 1.741096 | 0.179060 | 0.194801 | 1.409928 | 0.047079 | 10.460157 | 32.895324 | 0.001888 | 0.154386 | 0.169507 | 1.065668 | 0.807569 |
| min | 4.600000 | 0.120000 | 0.000000 | 0.900000 | 0.012000 | 1.000000 | 6.000000 | 0.990070 | 2.740000 | 0.330000 | 8.400000 | 3.000000 |
| 25% | 7.100000 | 0.390000 | 0.090000 | 1.900000 | 0.070000 | 7.000000 | 22.000000 | 0.995600 | 3.210000 | 0.550000 | 9.500000 | 5.000000 |
| 50% | 7.900000 | 0.520000 | 0.260000 | 2.200000 | 0.079000 | 14.000000 | 38.000000 | 0.996750 | 3.310000 | 0.620000 | 10.200000 | 6.000000 |
| 75% | 9.200000 | 0.640000 | 0.420000 | 2.600000 | 0.090000 | 21.000000 | 62.000000 | 0.997830 | 3.400000 | 0.730000 | 11.100000 | 6.000000 |
| max | 15.900000 | 1.580000 | 1.000000 | 15.500000 | 0.611000 | 72.000000 | 289.000000 | 1.003690 | 4.010000 | 2.000000 | 14.900000 | 8.000000 |
欠損値を含む行を表示してみる
display(df_all[df_all.isnull().any(axis=1)])
| fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | NaN | 3.20 | 0.68 | 9.8 | 5 |
| 3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | NaN | 3.16 | 0.58 | 9.8 | 6 |
| 4 | 7.4 | 0.70 | 0.00 | 1.9 | NaN | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
欠損値を含む行を削除する
df_all = df_all.dropna().reset_index(drop=True)
print(df_all.shape)
display(df_all.head())
(1596, 12)
| fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
| 1 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 | 5 |
| 2 | 7.4 | 0.66 | 0.00 | 1.8 | 0.075 | 13.0 | 40.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
| 3 | 7.9 | 0.60 | 0.06 | 1.6 | 0.069 | 15.0 | 59.0 | 0.9964 | 3.30 | 0.46 | 9.4 | 5 |
| 4 | 7.3 | 0.65 | 0.00 | 1.2 | 0.065 | 15.0 | 21.0 | 0.9946 | 3.39 | 0.47 | 10.0 | 7 |
Answer 1: #data = 1596
説明変数と目的変数を分ける
X_all_org = df_all.loc[:, 'fixed_acidity':'alcohol'] # explanatory variables
y = df_all['quality'] # objective variable
print('X_all_org:', X_all_org.shape)
display(X_all_org.head())
print('y:', y.shape)
print(y.head())
X_all_org: (1596, 11)
| fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 |
| 1 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 |
| 2 | 7.4 | 0.66 | 0.00 | 1.8 | 0.075 | 13.0 | 40.0 | 0.9978 | 3.51 | 0.56 | 9.4 |
| 3 | 7.9 | 0.60 | 0.06 | 1.6 | 0.069 | 15.0 | 59.0 | 0.9964 | 3.30 | 0.46 | 9.4 |
| 4 | 7.3 | 0.65 | 0.00 | 1.2 | 0.065 | 15.0 | 21.0 | 0.9946 | 3.39 | 0.47 | 10.0 |
y: (1596,) 0 5 1 5 2 5 3 5 4 7 Name: quality, dtype: int64
ダミー変数化
# get_dummies is not needed
X_all = X_all_org.copy()
#X_all = pd.get_dummies(X_all_org, drop_first=True)
#print('X_all:', X_all.shape)
#display(X_all.head())
変数間の総当たり散布図を描画。相関係数も算出しておく
総当たりのPearson相関係数
corr_all = X_all.corr(method='pearson')
display(corr_all)
| fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| fixed_acidity | 1.000000 | -0.254796 | 0.671376 | 0.115231 | 0.094037 | -0.154094 | -0.113752 | 0.668356 | -0.683007 | 0.183527 | -0.061580 |
| volatile_acidity | -0.254796 | 1.000000 | -0.550864 | 0.001749 | 0.061071 | -0.011231 | 0.076463 | 0.022282 | 0.234924 | -0.261792 | -0.202022 |
| citric_acid | 0.671376 | -0.550864 | 1.000000 | 0.143933 | 0.204452 | -0.060859 | 0.035440 | 0.365623 | -0.542063 | 0.313457 | 0.109363 |
| residual_sugar | 0.115231 | 0.001749 | 0.143933 | 1.000000 | 0.055469 | 0.187006 | 0.203091 | 0.355759 | -0.085640 | 0.005230 | 0.041679 |
| chlorides | 0.094037 | 0.061071 | 0.204452 | 0.055469 | 1.000000 | 0.005389 | 0.047337 | 0.200882 | -0.265166 | 0.371164 | -0.221426 |
| free_sulfur_dioxide | -0.154094 | -0.011231 | -0.060859 | 0.187006 | 0.005389 | 1.000000 | 0.667541 | -0.021855 | 0.071304 | 0.051475 | -0.069386 |
| total_sulfur_dioxide | -0.113752 | 0.076463 | 0.035440 | 0.203091 | 0.047337 | 0.667541 | 1.000000 | 0.071252 | -0.065735 | 0.042895 | -0.205651 |
| density | 0.668356 | 0.022282 | 0.365623 | 0.355759 | 0.200882 | -0.021855 | 0.071252 | 1.000000 | -0.342146 | 0.148960 | -0.495957 |
| pH | -0.683007 | 0.234924 | -0.542063 | -0.085640 | -0.265166 | 0.071304 | -0.065735 | -0.342146 | 1.000000 | -0.196633 | 0.206091 |
| sulphates | 0.183527 | -0.261792 | 0.313457 | 0.005230 | 0.371164 | 0.051475 | 0.042895 | 0.148960 | -0.196633 | 1.000000 | 0.093188 |
| alcohol | -0.061580 | -0.202022 | 0.109363 | 0.041679 | -0.221426 | -0.069386 | -0.205651 | -0.495957 | 0.206091 | 0.093188 | 1.000000 |
相関係数の絶対値が大きい説明変数ペアの出力
th_corr = 0.3
keep = np.triu(np.ones(corr_all.shape), k=1).astype('bool').flatten()
triu = corr_all.stack()[keep]
triu_sorted = triu[ np.abs(triu).sort_values(ascending=False).index ]
print(triu_sorted[ (triu_sorted < -th_corr) | (triu_sorted > th_corr) ])
fixed_acidity pH -0.683007
citric_acid 0.671376
density 0.668356
free_sulfur_dioxide total_sulfur_dioxide 0.667541
volatile_acidity citric_acid -0.550864
citric_acid pH -0.542063
density alcohol -0.495957
chlorides sulphates 0.371164
citric_acid density 0.365623
residual_sugar density 0.355759
density pH -0.342146
citric_acid sulphates 0.313457
dtype: float64
変数間の総当たり散布図
sns.pairplot(X_all)
plt.show()
Heatmapを描いてもよい
plt.figure(figsize=(10,10))
sns.heatmap(corr_all,annot=True,fmt='.2f',cmap='bwr')
plt.show()
全説明変数を用いて、標準化なしで線形重回帰分析
X_all_c = sm.add_constant(X_all)
model = sm.OLS(y, X_all_c)
results = model.fit()
print(results.summary())
OLS Regression Results
==============================================================================
Dep. Variable: quality R-squared: 0.360
Model: OLS Adj. R-squared: 0.356
Method: Least Squares F-statistic: 81.03
Date: Mon, 29 Mar 2021 Prob (F-statistic): 6.17e-145
Time: 16:02:37 Log-Likelihood: -1567.6
No. Observations: 1596 AIC: 3159.
Df Residuals: 1584 BIC: 3224.
Df Model: 11
Covariance Type: nonrobust
========================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------
const 21.6728 21.221 1.021 0.307 -19.951 63.297
fixed_acidity 0.0245 0.026 0.942 0.346 -0.026 0.075
volatile_acidity -1.0814 0.121 -8.914 0.000 -1.319 -0.843
citric_acid -0.1836 0.147 -1.246 0.213 -0.473 0.105
residual_sugar 0.0163 0.015 1.086 0.278 -0.013 0.046
chlorides -1.8757 0.420 -4.470 0.000 -2.699 -1.053
free_sulfur_dioxide 0.0044 0.002 2.012 0.044 0.000 0.009
total_sulfur_dioxide -0.0033 0.001 -4.485 0.000 -0.005 -0.002
density -17.5773 21.660 -0.812 0.417 -60.063 24.908
pH -0.4172 0.192 -2.173 0.030 -0.794 -0.041
sulphates 0.9176 0.114 8.016 0.000 0.693 1.142
alcohol 0.2766 0.027 10.430 0.000 0.225 0.329
==============================================================================
Omnibus: 27.070 Durbin-Watson: 1.757
Prob(Omnibus): 0.000 Jarque-Bera (JB): 40.336
Skew: -0.168 Prob(JB): 1.74e-09
Kurtosis: 3.703 Cond. No. 1.13e+05
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.13e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
#help(results)
#print(dir(results)) # To know all methods/attributes of an object
決定係数や自由度調整済み決定係数をみて、そもそも線形モデルの当てはめが妥当かどうかを判断
print('R2:', results.rsquared)
print('Adj R2:', results.rsquared_adj)
R2: 0.3600832139208683 Adj R2: 0.3556393473508743
Answer 2: R2 = 0.360
Not so good ... but move on to the next step.
重回帰式の検定 (求めた重回帰式は目的変数を説明している?)
print('p-values (F-statistic)', results.f_pvalue)
p-values (F-statistic) 6.1687486357125506e-145
Very small p-value, so this MLR equation is considered to be significant.
Compare coefficients for explanatory variables
全説明変数と目的変数を標準化して線形重回帰分析
標準化偏回帰係数を比較
# NOTE: after scaling, X_scaled and Y_scaled are ndarray, not DataFrame.
X_scaled = preprocessing.scale(X_all)
y_scaled = preprocessing.scale(y)
model = sm.OLS(y_scaled, X_scaled)
results = model.fit()
print(results.summary())
OLS Regression Results
=======================================================================================
Dep. Variable: y R-squared (uncentered): 0.360
Model: OLS Adj. R-squared (uncentered): 0.356
Method: Least Squares F-statistic: 81.08
Date: Mon, 29 Mar 2021 Prob (F-statistic): 4.95e-145
Time: 16:02:37 Log-Likelihood: -1908.4
No. Observations: 1596 AIC: 3839.
Df Residuals: 1585 BIC: 3898.
Df Model: 11
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
x1 0.0528 0.056 0.942 0.346 -0.057 0.163
x2 -0.2394 0.027 -8.916 0.000 -0.292 -0.187
x3 -0.0442 0.035 -1.247 0.213 -0.114 0.025
x4 0.0285 0.026 1.087 0.277 -0.023 0.080
x5 -0.1094 0.024 -4.471 0.000 -0.157 -0.061
x6 0.0567 0.028 2.013 0.044 0.001 0.112
x7 -0.1333 0.030 -4.486 0.000 -0.192 -0.075
x8 -0.0411 0.051 -0.812 0.417 -0.140 0.058
x9 -0.0797 0.037 -2.174 0.030 -0.152 -0.008
x10 0.1927 0.024 8.019 0.000 0.146 0.240
x11 0.3650 0.035 10.433 0.000 0.296 0.434
==============================================================================
Omnibus: 27.070 Durbin-Watson: 1.757
Prob(Omnibus): 0.000 Jarque-Bera (JB): 40.336
Skew: -0.168 Prob(JB): 1.74e-09
Kurtosis: 3.703 Cond. No. 7.21
==============================================================================
Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
変数選択
# NOTE: make DataFrames corresponding to X_scaled and y_scaled.
dfX_scaled = pd.DataFrame(X_scaled, columns=X_all.columns)
dfy_scaled = pd.Series(y_scaled, name=y.name)
exog = list(dfX_scaled.columns) # Initial set = all explanatory variables
endog = [dfy_scaled.name] # Objective variables
df_scaled = pd.concat([dfX_scaled, dfy_scaled], axis=1)
変数増加法による変数選択
results_aic=step_aic_forward(smf.ols, exog, endog, data=df_scaled)
AIC: 4531.252, formula: quality ~ 1 AIC: 4529.172, formula: quality ~ free_sulfur_dioxide AIC: 4506.508, formula: quality ~ chlorides AIC: 4483.646, formula: quality ~ density AIC: 4123.148, formula: quality ~ alcohol AIC: 4528.011, formula: quality ~ pH AIC: 4532.954, formula: quality ~ residual_sugar AIC: 4270.337, formula: quality ~ volatile_acidity AIC: 4477.538, formula: quality ~ total_sulfur_dioxide AIC: 4428.999, formula: quality ~ sulphates AIC: 4450.253, formula: quality ~ citric_acid AIC: 4508.780, formula: quality ~ fixed_acidity AIC: 4124.514, formula: quality ~ alcohol + free_sulfur_dioxide AIC: 4123.949, formula: quality ~ alcohol + chlorides AIC: 4114.882, formula: quality ~ alcohol + density AIC: 4072.263, formula: quality ~ alcohol + pH AIC: 4125.070, formula: quality ~ alcohol + residual_sugar AIC: 3928.074, formula: quality ~ alcohol + volatile_acidity AIC: 4108.632, formula: quality ~ alcohol + total_sulfur_dioxide AIC: 4033.266, formula: quality ~ alcohol + sulphates AIC: 4061.342, formula: quality ~ alcohol + citric_acid AIC: 4076.123, formula: quality ~ alcohol + fixed_acidity AIC: 3928.580, formula: quality ~ alcohol + volatile_acidity + free_sulfur_dioxide AIC: 3929.233, formula: quality ~ alcohol + volatile_acidity + chlorides AIC: 3925.723, formula: quality ~ alcohol + volatile_acidity + density AIC: 3916.715, formula: quality ~ alcohol + volatile_acidity + pH AIC: 3930.052, formula: quality ~ alcohol + volatile_acidity + residual_sugar AIC: 3915.671, formula: quality ~ alcohol + volatile_acidity + total_sulfur_dioxide AIC: 3885.199, formula: quality ~ alcohol + volatile_acidity + sulphates AIC: 3929.652, formula: quality ~ alcohol + volatile_acidity + citric_acid AIC: 3917.297, formula: quality ~ alcohol + volatile_acidity + fixed_acidity AIC: 3884.626, formula: quality ~ alcohol + volatile_acidity + sulphates + free_sulfur_dioxide AIC: 3869.818, formula: quality ~ alcohol + volatile_acidity + sulphates + chlorides AIC: 3886.694, formula: quality ~ alcohol + volatile_acidity + sulphates + density AIC: 3880.187, formula: quality ~ alcohol + volatile_acidity + sulphates + pH AIC: 3887.167, formula: quality ~ alcohol + volatile_acidity + sulphates + residual_sugar AIC: 3868.077, formula: quality ~ alcohol + volatile_acidity + sulphates + total_sulfur_dioxide AIC: 3886.594, formula: quality ~ alcohol + volatile_acidity + sulphates + citric_acid AIC: 3879.586, formula: quality ~ alcohol + volatile_acidity + sulphates + fixed_acidity AIC: 3866.897, formula: quality ~ alcohol + volatile_acidity + sulphates + total_sulfur_dioxide + free_sulfur_dioxide AIC: 3851.195, formula: quality ~ alcohol + volatile_acidity + sulphates + total_sulfur_dioxide + chlorides AIC: 3869.831, formula: quality ~ alcohol + volatile_acidity + sulphates + total_sulfur_dioxide + density AIC: 3862.384, formula: quality ~ alcohol + volatile_acidity + sulphates + total_sulfur_dioxide + pH AIC: 3869.453, formula: quality ~ alcohol + volatile_acidity + sulphates + total_sulfur_dioxide + residual_sugar AIC: 3869.892, formula: quality ~ alcohol + volatile_acidity + sulphates + total_sulfur_dioxide + citric_acid AIC: 3865.241, formula: quality ~ alcohol + volatile_acidity + sulphates + total_sulfur_dioxide + fixed_acidity AIC: 3850.249, formula: quality ~ alcohol + volatile_acidity + sulphates + total_sulfur_dioxide + chlorides + free_sulfur_dioxide AIC: 3852.841, formula: quality ~ alcohol + volatile_acidity + sulphates + total_sulfur_dioxide + chlorides + density AIC: 3839.219, formula: quality ~ alcohol + volatile_acidity + sulphates + total_sulfur_dioxide + chlorides + pH AIC: 3851.875, formula: quality ~ alcohol + volatile_acidity + sulphates + total_sulfur_dioxide + chlorides + residual_sugar AIC: 3852.835, formula: quality ~ alcohol + volatile_acidity + sulphates + total_sulfur_dioxide + chlorides + citric_acid AIC: 3847.590, formula: quality ~ alcohol + volatile_acidity + sulphates + total_sulfur_dioxide + chlorides + fixed_acidity AIC: 3835.476, formula: quality ~ alcohol + volatile_acidity + sulphates + total_sulfur_dioxide + chlorides + pH + free_sulfur_dioxide AIC: 3841.109, formula: quality ~ alcohol + volatile_acidity + sulphates + total_sulfur_dioxide + chlorides + pH + density AIC: 3840.532, formula: quality ~ alcohol + volatile_acidity + sulphates + total_sulfur_dioxide + chlorides + pH + residual_sugar AIC: 3839.254, formula: quality ~ alcohol + volatile_acidity + sulphates + total_sulfur_dioxide + chlorides + pH + citric_acid AIC: 3841.188, formula: quality ~ alcohol + volatile_acidity + sulphates + total_sulfur_dioxide + chlorides + pH + fixed_acidity AIC: 3837.399, formula: quality ~ alcohol + volatile_acidity + sulphates + total_sulfur_dioxide + chlorides + pH + free_sulfur_dioxide + density AIC: 3837.071, formula: quality ~ alcohol + volatile_acidity + sulphates + total_sulfur_dioxide + chlorides + pH + free_sulfur_dioxide + residual_sugar AIC: 3836.304, formula: quality ~ alcohol + volatile_acidity + sulphates + total_sulfur_dioxide + chlorides + pH + free_sulfur_dioxide + citric_acid AIC: 3837.455, formula: quality ~ alcohol + volatile_acidity + sulphates + total_sulfur_dioxide + chlorides + pH + free_sulfur_dioxide + fixed_acidity The best formula: quality ~ alcohol + volatile_acidity + sulphates + total_sulfur_dioxide + chlorides + pH + free_sulfur_dioxide Minimum AIC: 3835.476
print(results_aic.aic)
print(results_aic.model.exog_names)
print(results_aic.model.endog_names)
3835.4763012244853 ['Intercept', 'alcohol', 'volatile_acidity', 'sulphates', 'total_sulfur_dioxide', 'chlorides', 'pH', 'free_sulfur_dioxide'] quality
マルチコのチェック
Format of results of statsmodels: https://www.statsmodels.org/dev/generated/statsmodels.regression.linear_model.RegressionResults.html
endogs = results_aic.model.endog_names
exogs = results_aic.model.exog_names.copy()
exogs.remove('Intercept')
#print(exogs) # debug
X_c = sm.add_constant(X_all[exogs])
vifs = calc_vifs(X_c, y)
display(vifs)
| VIF | |
|---|---|
| const | 618.539992 |
| alcohol | 1.220248 |
| volatile_acidity | 1.242579 |
| sulphates | 1.322258 |
| total_sulfur_dioxide | 1.943518 |
| chlorides | 1.333407 |
| pH | 1.255038 |
| free_sulfur_dioxide | 1.882888 |
How to eliminate multicolinearity is case by case.
Here we simply delete three explanatory variables with high VIF.
# all VIFs < 10, so removal of variables is not needed
#exogs.remove('w_all')
#exogs.remove('w_meat')
#exogs.remove('w_shell')
#print(exogs) # debug
#X_c = sm.add_constant(X_all[exogs])
#vifs = calc_vifs(X_c, y)
#display(vifs)
For all explantory variables, VIF < 10, so we can go forward.
最終的に得られた標準化偏回帰係数から、各説明変数の目的変数に対する影響の大きさがわかる
X_final_scaled = dfX_scaled[exogs]
model_final_scaled = sm.OLS(y_scaled, X_final_scaled)
results_final_scaled = model_final_scaled.fit()
print(results_final_scaled.summary())
OLS Regression Results
=======================================================================================
Dep. Variable: y R-squared (uncentered): 0.359
Model: OLS Adj. R-squared (uncentered): 0.356
Method: Least Squares F-statistic: 127.1
Date: Mon, 29 Mar 2021 Prob (F-statistic): 1.48e-148
Time: 16:02:38 Log-Likelihood: -1909.7
No. Observations: 1596 AIC: 3833.
Df Residuals: 1589 BIC: 3871.
Df Model: 7
Covariance Type: nonrobust
========================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------
alcohol 0.3819 0.022 17.212 0.000 0.338 0.425
volatile_acidity -0.2235 0.022 -9.984 0.000 -0.267 -0.180
sulphates 0.1856 0.023 8.036 0.000 0.140 0.231
total_sulfur_dioxide -0.1421 0.028 -5.074 0.000 -0.197 -0.087
chlorides -0.1177 0.023 -5.073 0.000 -0.163 -0.072
pH -0.0922 0.023 -4.097 0.000 -0.136 -0.048
free_sulfur_dioxide 0.0660 0.028 2.393 0.017 0.012 0.120
==============================================================================
Omnibus: 23.950 Durbin-Watson: 1.750
Prob(Omnibus): 0.000 Jarque-Bera (JB): 34.738
Skew: -0.156 Prob(JB): 2.86e-08
Kurtosis: 3.652 Cond. No. 2.44
==============================================================================
Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
print(results_final_scaled.params)
alcohol 0.381874 volatile_acidity -0.223534 sulphates 0.185601 total_sulfur_dioxide -0.142075 chlorides -0.117664 pH -0.092192 free_sulfur_dioxide 0.065957 dtype: float64
The order of strengths of influences on objective variables:
**Answer 3: alcohol (positive) > volatile_acidity (negative) > sulphates (positive) > total_sulfur_dioxide (negative)
> chlorides (negative) > pH ( negative) > free_sulfur_dioxide (positive)**
(これが、目的変数qualityに対する各説明変数の影響の大きさ順)
重回帰式の検定 (求めた重回帰式は目的変数を説明している?)
print('p-values (F-statistic)', results_final_scaled.f_pvalue)
p-values (F-statistic) 1.4839271358928643e-148
Very small p-value, so this MLR equation is considered to be significant.
選択された説明変数を用いて、標準化なしで線形重回帰分析
X_final_c = sm.add_constant(X_all[exogs])
model_final = sm.OLS(y, X_final_c)
results_final = model_final.fit()
print(results_final.summary())
OLS Regression Results
==============================================================================
Dep. Variable: quality R-squared: 0.359
Model: OLS Adj. R-squared: 0.356
Method: Least Squares F-statistic: 127.1
Date: Mon, 29 Mar 2021 Prob (F-statistic): 1.85e-148
Time: 16:02:38 Log-Likelihood: -1568.9
No. Observations: 1596 AIC: 3154.
Df Residuals: 1588 BIC: 3197.
Df Model: 7
Covariance Type: nonrobust
========================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------
const 4.4260 0.404 10.967 0.000 3.634 5.218
alcohol 0.2894 0.017 17.206 0.000 0.256 0.322
volatile_acidity -1.0098 0.101 -9.981 0.000 -1.208 -0.811
sulphates 0.8840 0.110 8.034 0.000 0.668 1.100
total_sulfur_dioxide -0.0035 0.001 -5.072 0.000 -0.005 -0.002
chlorides -2.0181 0.398 -5.072 0.000 -2.799 -1.238
pH -0.4825 0.118 -4.096 0.000 -0.714 -0.251
free_sulfur_dioxide 0.0051 0.002 2.392 0.017 0.001 0.009
==============================================================================
Omnibus: 23.950 Durbin-Watson: 1.750
Prob(Omnibus): 0.000 Jarque-Bera (JB): 34.738
Skew: -0.156 Prob(JB): 2.86e-08
Kurtosis: 3.652 Cond. No. 1.70e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.7e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
決定係数や自由度調整済み決定係数をみて、線形モデルの当てはまりの良さをチェック
print('R2:', results_final.rsquared)
print('Adj R2:', results_final.rsquared_adj)
R2: 0.3589972226304371 Adj R2: 0.3561716436369945
Answer 4: 0.356
The fit of "the best model" is not good ...
最終的に得られた偏回帰係数から、「各説明変数が1増えたときの目的変数の増分」がわかる。
print(results_final.params)
const 4.426027 alcohol 0.289401 volatile_acidity -1.009819 sulphates 0.884003 total_sulfur_dioxide -0.003487 chlorides -2.018132 pH -0.482495 free_sulfur_dioxide 0.005091 dtype: float64
Coefficients of MLR;
Increment of objective variable when corresponding variable is increased by 1
and other variables are not changed
Answer 5: quality $\sim$ 4.426 + 0.289 alcohol + (-1.010) volatile_acidity + 0.884 sulphates +
(-0.003) total_sulfur_dioxide + (-2.018) chlorides + (-0.482) pH + (0.005) * free_sulfur_dioxide
Answer 6: -0.482